* Start from an empty session before building the panel
clear all

* Resource settings
* NOTE(review): recent Stata releases manage memory automatically and may
* treat these two settings as no-ops -- confirm against the target version
set mem 10000000
set matsize 10000

* Do not pause for more prompts while the script runs
set more off

*****************************************************************************************
*** BUILD ECONOMIC CENSUS ***
*****************************************************************************************

** Load the file path globals; the path_code global must already be defined
do "$path_code/paths.do"

/*
******************************************************************************
* Description
******************************************************************************

* Merge: create panel of EC data across years
* For each PCA 01 village and EC year, the panel will contain the 2001 PCA
* identifiers, the shrid, EC employees, and EC firms

* Author: Garrison Schlauch
* Last updated: November 11th, 2020

* Notes: 
// 1) if an administrative unit is not later combined with another, it will correspond to a single shrug id. However, if 2 or more administrative units are combined in later years, they will have the same shrug id

* Inputs:
// village_nic*.dta: economic census data for 1990, 1998, 2005, and 2013 
// shrug_ec[year]r_key.dta: SHRUG <---> EC crosswalks
// shrug_pca01_xwalk.dta: SHRUG <---> 2001 PCA crosswalk
// panel_dataset_full.dta: 2001 PCA crosswalk <---> 2001 PCA ids

* Outputs:
// ec_shrid_pc01_panel.dta: pca01_id level panel across EC years with employment and firm counts

* User packages:
// gtools (for collapsing and identifying duplicates)
// ftools (for merging)
// unique (for checking that identifiers are unique)

*/

******************************************************************************
* Clear temporary directory
******************************************************************************

* Move into the temp directory and list every .dta file found there
cd "$path/data/temp"
local files: dir . files "*.dta"

* Delete each leftover file. The name is wrapped in double quotes so that a
* file name containing spaces is passed to erase as a single argument
foreach file of local files {
	erase "`file'"
}

******************************************************************************
* Loop through EC years, merging EC-shrug crosswalk with non-shrug EC data
******************************************************************************

* EC waves to process, given as two-digit year suffixes
local ec_yrs 90 98 05 13

* Year-independent EC village identifiers, used as collapse and merge keys
local ec_ids ec_state_id ec_district_id ec_subdistrict_id ec_village_id

* Identifier stubs used when renaming and destringing the crosswalk ids
local shrug_ec_ids state_id district_id subdistrict_id village_id

* Process one EC wave per iteration
foreach ec_yr of local ec_yrs {

	* Four-digit year of the wave: suffixes 90 and 98 belong to the 1900s,
	* suffixes 05 and 13 to the 2000s
	local ec_yr_full = cond(inlist(`ec_yr', 90, 98), 1900, 2000) + `ec_yr'
	
	* Load the non-shrug EC file for this wave
	use "$ec/village_nic`ec_yr_full'.dta", clear

	** Give the identifiers year-independent names
	* State, district and village/town codes carry the same names in every wave
	rename (state_code district_code villagetown) ///
		(ec_state_id ec_district_id ec_village_id)

	* The subdistrict code is admin_div in 1990 and tehsil_code in later waves
	if `ec_yr' == 90 {
		rename admin_div ec_subdistrict_id
	}
	else {
		rename tehsil_code ec_subdistrict_id
	}

	* 1998 only: drop the row whose tehsil code contains characters that
	* won't render in Stata
	if `ec_yr' == 98 {
		drop if id == "020601500110" & ec_subdistrict_id != "0"
	}
	
	* Drop any observations with missing identifiers. The loop runs over
	* ec_ids, the identifier list defined before the year loop; looping over
	* an undefined macro would silently do nothing
	foreach ec_id of local ec_ids {
		drop if missing(`ec_id')
	}

	* Destring EC identifiers if they are not already numeric, so that they
	* can be used as keys alongside the destrung crosswalk identifiers
	foreach ec_id of local ec_ids {
		destring `ec_id', replace
	}

	* Check that no firm or employee data is missing before summing them
	assert !missing(firms) & !missing(employees)
	
	* Drop industries that are not common across ECs. For example, 
	* firms in government and national defense firms were not included 
	* in the 2013 EC. According to SHRUG, firms in agriculture are counted
	* differently across years, so we drop them as well:
	* see v1.4 codebook: https://github.com/devdatalab/SHRUG/releases
	
	* Note: for EC 1990 and 1998, many nic codes are not divisible by 10, while
	* only those divisible by 10 are in the codebook. Based on their
	* classification under broadactivity, these nic codes appear to fall under
	* the floor nic code that is divisible by 10. For example, 0001 would
	* fall under 0000, 9033 would fall under 9030, etc
	
	* 1990 wave: remove government/defense and agricultural industries.
	* nic_code is matched against string literals here, one code at a time
	if `ec_yr' == 90 {
		* Note: there are non-numeric nic_codes, so these must be listed
		* individually
	// http://microdata.gov.in/nada43/index.php/catalog/142/related_materials
		
		* Gov admin & defense
		* (every code listed below begins with 90)
		drop if nic_code == "9000" | nic_code == "9001" | ///
			nic_code == "9002" | nic_code == "9003" | nic_code == "9010" | ///
			nic_code == "9011" | nic_code == "9012" | nic_code == "9013" | ///
			nic_code == "9014" | nic_code == "9016" | nic_code == "9018" | ///
			nic_code == "9019" | nic_code == "9020" | nic_code == "9021" | ///
			nic_code == "9022" | nic_code == "9023" | nic_code == "9030" | ///
			nic_code == "9031" | nic_code == "9026"

		* Agriculture
		* (every code listed below begins with 00 through 06; the list
		* includes the non-numeric entry 036-)
		drop if nic_code == "0000" | nic_code == "0001" | ///
			nic_code == "0002" | nic_code == "0003" | nic_code == "0004" | ///
			nic_code == "0005" | nic_code == "0006" | nic_code == "0007" | ///
			nic_code == "0008" | nic_code == "0009" | nic_code == "0010" | ///
			nic_code == "0013" | nic_code == "0014" | nic_code == "0018" | ///
			nic_code == "0021" | nic_code == "0022" | nic_code == "0034" | ///
			nic_code == "0042" | nic_code == "0049" | nic_code == "0050" | ///
			nic_code == "0051" | nic_code == "0053" | nic_code == "0057" | ///
			nic_code == "0061" | nic_code == "0063" | nic_code == "0067" | ///
			nic_code == "0069" | nic_code == "0070" | nic_code == "0090" | ///
			nic_code == "0091" | nic_code == "0100" | nic_code == "0110" | ///
			nic_code == "0120" | nic_code == "0132" | nic_code == "0151" | ///
			nic_code == "0152" | nic_code == "0161" | nic_code == "0172" | ///
			nic_code == "0179" | nic_code == "0191" | nic_code == "0199" | ///
			nic_code == "0200" | nic_code == "0201" | nic_code == "0202" | ///
			nic_code == "0204" | nic_code == "0207" | nic_code == "0209" | ///
			nic_code == "0210" | nic_code == "0212" | nic_code == "0220" | ///
			nic_code == "0222" | nic_code == "0230" | nic_code == "0235" | ///
			nic_code == "0240" | nic_code == "0242" | nic_code == "0246" | ///
			nic_code == "0250" | nic_code == "0251" | nic_code == "0259" | ///
			nic_code == "0260" | nic_code == "0270" | nic_code == "0272" | ///
			nic_code == "0275" | nic_code == "0276" | nic_code == "0277" | ///
			nic_code == "0279" | nic_code == "0290" | nic_code == "0291" | ///
			nic_code == "0299" | nic_code == "0300" | nic_code == "0301" | ///
			nic_code == "0310" | nic_code == "0311" | nic_code == "0320" | ///
			nic_code == "0329" | nic_code == "0330" | nic_code == "0339" | ///
			nic_code == "0340" | nic_code == "0341" | nic_code == "0343" | ///
			nic_code == "0350" | nic_code == "036-" | nic_code == "0360" | ///
			nic_code == "0370" | nic_code == "0390" | nic_code == "0391" | ///
			nic_code == "0395" | nic_code == "0399" | nic_code == "0400" | ///
			nic_code == "0410" | nic_code == "0422" | nic_code == "0440" | ///
			nic_code == "0500" | nic_code == "0501" | nic_code == "0502" | ///
			nic_code == "0503" | nic_code == "0510" | nic_code == "0511" | ///
			nic_code == "0512" | nic_code == "0515" | nic_code == "0519" | ///
			nic_code == "0520" | nic_code == "0530" | nic_code == "0531" | ///
			nic_code == "0540" | nic_code == "0550" | nic_code == "0560" | ///
			nic_code == "0570" | nic_code == "0590" | nic_code == "0599" | ///
			nic_code == "0600" | nic_code == "0601" | nic_code == "0610" | ///
			nic_code == "0611" | nic_code == "0612" | nic_code == "0616" | ///
			nic_code == "0619" | nic_code == "0620" | nic_code == "0629" | ///
			nic_code == "0630" | nic_code == "0639" | nic_code == "0640" | ///
			nic_code == "0650" | nic_code == "0660" | nic_code == "0690" | ///
			nic_code == "0699"
	}
	
	* 1998 wave: remove government/defense and agricultural industries.
	* nic_code is matched against string literals here, one code at a time
	if `ec_yr' == 98 {
		* Note: there are non-numeric nic_codes, so these must be listed
		* individually
		
		* Gov admin & defense
		* (every code listed below begins with 90)
		drop if nic_code == "9000" | nic_code == "9001" | ///
			nic_code == "9002" | nic_code == "9003" | nic_code == "9010" | ///
			nic_code == "9011" | nic_code == "9012" | nic_code == "9013" | ///
			nic_code == "9014" | nic_code == "9016" | nic_code == "9018" | ///
			nic_code == "9019" | nic_code == "9020" | nic_code == "9021" | ///
			nic_code == "9022" | nic_code == "9023" | nic_code == "9030" | ///
			nic_code == "9031" | nic_code == "9026"

		* Agriculture
		* (every code listed below begins with 00 through 06; the list
		* includes the two-character entry 03)
		drop if nic_code == "0200" | nic_code == "0201" | ///
			nic_code == "0202" | nic_code == "0208" | nic_code == "0210" | ///
			nic_code == "0212" | nic_code == "0220" | nic_code == "0221" | ///
			nic_code == "0222" | nic_code == "0230" | nic_code == "0233" | ///
			nic_code == "0237" | nic_code == "0240" | nic_code == "0241" | ///
			nic_code == "0242" | nic_code == "0250" | nic_code == "0251" | ///
			nic_code == "0252" | nic_code == "0260" | nic_code == "0270" | ///
			nic_code == "0290" | nic_code == "0294" | nic_code == "03" | ///
			nic_code == "0300" | nic_code == "0302" | nic_code == "0310" | ///
			nic_code == "0315" | nic_code == "0320" | nic_code == "0330" | ///
			nic_code == "0339" | nic_code == "0340" | nic_code == "0350" | ///
			nic_code == "0360" | nic_code == "0370" | nic_code == "0390" | ///
			nic_code == "0391" | nic_code == "0392" | nic_code == "0394" | ///
			nic_code == "0399" | nic_code == "0400" | nic_code == "0402" | ///
			nic_code == "0410" | nic_code == "0412" | nic_code == "0430" | ///
			nic_code == "0490" | nic_code == "0500" | nic_code == "0501" | ///
			nic_code == "0502" | nic_code == "0503" | nic_code == "0510" | ///
			nic_code == "0512" | nic_code == "0513" | nic_code == "0520" | ///
			nic_code == "0522" | nic_code == "0530" | nic_code == "0540" | ///
			nic_code == "0550" | nic_code == "0590" | nic_code == "0600" | ///
			nic_code == "0601" | nic_code == "0602" | nic_code == "0610" | ///
			nic_code == "0620" | nic_code == "0623" | nic_code == "0630" | ///
			nic_code == "0640" | nic_code == "0650" | nic_code == "0660" | ///
			nic_code == "0661" | nic_code == "0687" | nic_code == "0690" | ///
			nic_code == "0000" | nic_code == "0001" | nic_code == "0003" | ///
			nic_code == "0010" | nic_code == "0012" | nic_code == "0013" | ///
			nic_code == "0020" | nic_code == "0023" | nic_code == "0025" | ///
			nic_code == "0041" | nic_code == "0043" | nic_code == "0049" | ///
			nic_code == "0050" | nic_code == "0061" | nic_code == "0063" | ///
			nic_code == "0069" | nic_code == "0070" | nic_code == "0091" | ///
			nic_code == "0095" | nic_code == "0096" | nic_code == "0099" | ///
			nic_code == "0100" | nic_code == "0102" | nic_code == "0110" | ///
			nic_code == "0111" | nic_code == "0151" | nic_code == "0152" | ///
			nic_code == "0170" | nic_code == "0171" | nic_code == "0172" | ///
			nic_code == "0174" | nic_code == "0175" | nic_code == "0179" | ///
			nic_code == "0199"
	}
	
	* 2005 and 2013 waves: nic_code is compared numerically, so the
	* industries to remove can be expressed as ranges
	if `ec_yr' == 05 {
		* Gov admin & defense: codes 7511 through 7530
		drop if inrange(nic_code, 7511, 7530)

		* Agriculture: every code below 1000
		drop if nic_code < 1000
	}
	else if `ec_yr' == 13 {
		* Gov admin & defense
		// NA

		* Agriculture: every code up to and including 39
		drop if nic_code <= 39
	}
	
	* Collapse to village level rather than village-nic level (or
	* village-nic-other_identifier level in the case of 1998).
	* The variable is spelled out in full as firms, matching the assert
	* earlier in the loop and the later shrid-level collapse, so the command
	* does not depend on variable-name abbreviation being enabled
	gcollapse (sum) employees firms, by(`ec_ids')

	* Compress to save space
	compress *

	* Temporarily save the village-level totals for the crosswalk merge
	save "$path/data/temp/nonshrug_ec_`ec_yr'_village", replace

	* Load the shrug <---> EC crosswalk for this wave. The 2005 wave reads the
	* key file with the _wdist_subdist_ids suffix; all other waves read the
	* plain key file
	if `ec_yr' == 05 {
		use "$shrug/shrug-v1.4.samosa-keys-dta/shrug_ec`ec_yr'r_key_wdist_subdist_ids.dta", clear
	}
	else {
		use "$shrug/shrug-v1.4.samosa-keys-dta/shrug_ec`ec_yr'r_key.dta", clear
	}

	* Strip the wave-specific prefix from each crosswalk identifier and make
	* it numeric, so the names match the EC identifiers built earlier
	foreach shrug_ec_id of local shrug_ec_ids {
		rename ec`ec_yr'_`shrug_ec_id' ec_`shrug_ec_id'
		destring ec_`shrug_ec_id', replace
	}
	
	* Attach the village-level EC totals to the shrug <---> EC crosswalk
	merge 1:1 `ec_ids' using ///
		"$path/data/temp/nonshrug_ec_`ec_yr'_village"

	* Keep matched villages only. This removes:
	*  - EC villages without a shrug id (_merge == 2)
	*  - villages in the shrug-EC crosswalk that are not in the EC data
	*    (_merge == 1). Note that this only happens for 2005 with the full
	*    list of nic codes, and for every state without gov admin, defense,
	*    or agricultural firms
	keep if _merge == 3

	* Aggregate employees and firms up to the shrug-id level
	gcollapse (sum) employees firms, by(shrid)

	* Record which EC wave these totals belong to
	gen ec_year = `ec_yr_full'
	
	* Attach the 2001 PCA village codes for each shrid. Only matches are
	* kept: the rest of the data is at the 2001 PCA level, so records that
	* cannot be placed at that level are not useful
	merge 1:m shrid using "$shrug/shrug_pca01_xwalk.dta", ///
		keepusing(st_code dt_code ta_code vi_code) keep(3) nogen

	* Each 2001 PCA village must now appear exactly once
	unique st_code dt_code ta_code vi_code
	assert r(unique) == r(N)

	* Bring in the unique PCA 2001 identifier, again keeping matches only
	* NOTE(review): ta_code is part of the uniqueness check above but not of
	* these merge keys -- confirm that this is intended
	merge 1:m st_code dt_code vi_code using ///
		"$panel/panel_dataset_full.dta", keep(3) nogen keepusing(pca01_id)

	* Tag duplicate shrids
	gduplicates tag shrid, gen(count_dup_shrid_ec)

	* Tag duplicate villages
	gduplicates tag st_code dt_code vi_code, gen(count_dup_vill_ec)

	* The 2001 PCA id must identify each row uniquely
	unique pca01_id
	assert r(N) == r(unique)

	* Temporarily save this wave; the shrug id-employee-firm panel is
	* assembled from these files once the loop has finished
	save "$path/data/temp/shrug_nonshrug_ec_`ec_yr'_village", replace
}

******************************************************************************
* Construct shrug id-employee-firm panel
******************************************************************************

clear

* List the per-wave EC files to be appended
cd "$path/data/temp"
local wave_files: dir . files "shrug_nonshrug*.dta"

* Stack the waves into one dataset
foreach wave_file of local wave_files {
	append using `wave_file'
}

* Prefix the outcome variables with their source
rename (employees firms) (ec_employees ec_firms)

* Flag villages with more firms than employees; the flag is 0 whenever
* either count is missing
gen byte flag_ec_firms_gtr_ec_emps = ec_firms > ec_employees & ///
	!missing(ec_employees) & !missing(ec_firms)

* Variable labels
label variable shrid "SHRUG identifier"
label variable ec_year "Economic Census year"
label variable ec_firms "EC # of firms"
label variable ec_employees "EC # of employees"
label variable flag_ec_firms_gtr_ec_emps "== 1 if the # of EC firms > # of EC employees"
label variable count_dup_vill_ec "# of duplicate PCA villages by EC year"
label variable count_dup_shrid_ec "# of duplicate shrids by EC year"

* Column order: identifiers, year, outcomes, then diagnostics
order pca01_id st_code dt_code ta_code vi_code shrid ec_year ///
	ec_employees ec_firms count_dup_vill_ec count_dup_shrid_ec ///
	flag_ec_firms_gtr_ec_emps

* Row order
sort pca01_id st_code dt_code ta_code vi_code shrid ec_year ///
	ec_employees ec_firms

* Shrink storage types before saving
compress

* Save the final panel
save "$panel/ec_shrid_pc01_panel.dta", replace

******************************************************************************
* Clear temporary directory
******************************************************************************

* Move into the temp directory and list every .dta file found there
cd "$path/data/temp"
local files: dir . files "*.dta"

* Delete each intermediate file. The name is wrapped in double quotes so
* that a file name containing spaces is passed to erase as a single argument
foreach file of local files {
	erase "`file'"
}

